# Read plankton and images
plankton <- read_parquet("data/00.plankton_clean.parquet") # no need to use the X correction here
#images <- read_parquet("data/00.images_clean.parquet")
# List taxonomic groups
taxa <- plankton %>% select(taxon) %>% distinct() %>% pull(taxon) %>% sort()
# Drop unwanted groups
taxa <- setdiff(taxa, c("Collodaria_colonial", "Rhizaria"))
plankton <- plankton %>% filter(taxon %in% taxa)
# Convert ESD from px to mm
plankton <- plankton %>% mutate(esd = esd * 51 / 1000)Build an empirical interaction matrix constrained by size of organisms.
Read data
Explore ESD values per taxonomic group
# Compute median ESD for each group for a nice ordered plot
plankton <- plankton %>%
group_by(taxon) %>%
mutate(median_esd = median(esd)) %>%
ungroup()
# Save median ESD by plankton group
plankton_esd <- plankton %>% select(taxon, median_esd) %>% distinct() %>% arrange(taxon)
save(plankton_esd, file = "data/16.plankton_esd.Rdata")
# Plot ESD distribution per group
ggplot(plankton) +
geom_boxplot(aes(x = fct_reorder(taxon, median_esd), y = esd), outlier.size = 0.5) +
scale_y_log10() +
labs(x = "Taxon", y = "ESD (mm)") +
coord_flip() +
theme_classic()Explore predator:prey size ratios
For a predator to feed on a prey, it typically has to be larger than the prey. Actually, there is an optimal predator to prey size ratio (PPSR) for each predator.
Let’s start with a given taxon as predator, with a given optimal PPSR. Let’s choose another taxon as prey. To estimate how likely the predator is to feed on the prey, we can use size distributions to compute the probability that the PPSR falls in the optimal range. For this, we randomly select 10,000 organisms in the predator taxon and 10,000 organisms in the prey taxon. We then compute 10,000 PPSR values, and count how many of them fall in the optimal range.
We’ll try various PPSR ranges, centred on values from 2 to 10, on a log-scale:
[5; 20], centred on 10
[4; 16], centred on 8
[3; 12], centred on 6
[2; 8], centred on 4
[1; 4], centred on 2
# Set up parallel processing plan
plan(multisession, workers = 8)
# Number of objects to randomly sample in each group
n_sam <- 10000
# Pre-sample each taxon in advance and store results in a list
sampled_plankton <- plankton %>%
group_by(taxon) %>%
slice_sample(n = n_sam) %>%
ungroup() %>%
select(taxon, esd) %>%
group_by(taxon)
sampled_plankton <- sampled_plankton %>%
group_split() %>%
set_names(unlist(group_keys(sampled_plankton))) # Named list for easy access by taxon name
# Generate all possible predator-prey pairs and expand for each opt_pp range
taxa_pairs <- crossing(predator = taxa, prey = taxa) %>%
#filter(predator != prey) %>%
mutate(opt_pp = list(list(c(5, 20), c(4, 16), c(3, 12), c(2, 8), c(1, 4)))) %>%
unnest(opt_pp) # Flatten list to duplicate rows per each opt_pp range
# Create the plankton subsets for each predator-prey pair by pulling from pre-sampled data
taxa_pairs <- taxa_pairs %>%
mutate(
plankton_subset = map2(predator, prey, ~bind_rows(
sampled_plankton[[.x]] %>%
slice_sample(n = min(nrow(sampled_plankton[[.x]]), nrow(sampled_plankton[[.y]]))) %>%
mutate(taxon = paste0(taxon, "_pred")) %>%
mutate(id = 1:n()),
sampled_plankton[[.y]] %>%
slice_sample(n = min(nrow(sampled_plankton[[.x]]), nrow(sampled_plankton[[.y]]))) %>%
mutate(taxon = paste0(taxon, "_prey")) %>%
mutate(id = 1:n())
))
) %>%
# Add a flag for each taxon as predator or prey
mutate(
predator = paste0(predator, "_pred"),
prey = paste0(prey, "_prey")
)
# Helper function to calculate proportion in a single optimal range for a predator-prey pair
calc_prop_pp_in <- function(predator, prey, opt_pp_range, plankton_subset) {
plankton_subset %>%
pivot_wider(names_from = taxon, values_from = esd) %>%
mutate(
pp = .data[[predator]] / .data[[prey]],
pp_in = between(pp, opt_pp_range[1], opt_pp_range[2])
) %>%
summarise(prop_pp_in = mean(pp_in, na.rm = TRUE)) %>%
pull(prop_pp_in)
}
# Apply the function in parallel using future_pmap_dbl to handle each opt_pp range separately
prop_ppsr <- taxa_pairs %>%
mutate(
prop_pp_in = future_pmap_dbl(
list(predator, prey, opt_pp, plankton_subset),
calc_prop_pp_in
)
) %>%
select(predator, prey, opt_pp, prop_pp_in)%>%
mutate(opt_pp = paste0(opt_pp) %>% fct_inorder()) %>%
mutate(
predator = str_remove(predator, "_pred"),
prey = str_remove(prey, "_prey")
)
# Shut down the parallel plan
plan(sequential)
# Summary of results
summary(prop_ppsr) predator prey opt_pp prop_pp_in
Length:3645 Length:3645 c(5, 20):729 Min. :0.0000
Class :character Class :character c(4, 16):729 1st Qu.:0.0000
Mode :character Mode :character c(3, 12):729 Median :0.0107
c(2, 8) :729 Mean :0.1523
c(1, 4) :729 3rd Qu.:0.1724
Max. :1.0000
We can plot the distribution of the proportion of PPSR falling in the optimal range for each range value.
ggplot(prop_ppsr) +
geom_density(aes(x = prop_pp_in)) +
xlim(0, 1) +
labs(x = "Proportion in optimal PPSR", y = "Density") +
facet_wrap(~opt_pp, scales = "free", ncol = 1) +
theme_classic()We can also plot these values in a matrix.
Separate positive and negative interactions
By construction, the computed values are in the range [0; 1]. As for other matrices (distance-based and co-occurrence), it is desirable to separate between positive and negative interactions. To set this threshold, we can have a look at what happens when we randomly sample two series of 10,000 objects, compute 10,1000 PPSR values and see the proportion that falls into considered size ranges.
# Randomly sample 10,000 predators and prey
set.seed(1)
rand_pred <- plankton %>% slice_sample(n = 10000)
rand_prey <- plankton %>% slice_sample(n = 10000)
# Compute PPSR (predator / prey)
df_rand <- tibble(
pred_esd = rand_pred$esd,
prey_esd = rand_prey$esd
) %>%
mutate(ppsr = pred_esd / prey_esd)
summary(df_rand) # Median PPSR is 1 by definition pred_esd prey_esd ppsr
Min. :0.4110 Min. : 0.4110 Min. : 0.08014
1st Qu.:0.5429 1st Qu.: 0.5429 1st Qu.: 0.72156
Median :0.6736 Median : 0.6760 Median : 1.00330
Mean :0.7751 Mean : 0.7767 Mean : 1.15543
3rd Qu.:0.8574 3rd Qu.: 0.8555 3rd Qu.: 1.37437
Max. :7.2143 Max. :13.2760 Max. :15.35361
# Compute proportion of PPSR falling into each optimal range
df_rand_prop <- df_rand %>%
# cross with optimize ranges
crossing(
tibble(opt_pp = list(c(5, 20), c(4, 16), c(3, 12), c(2, 8), c(1, 4))) %>%
mutate(opt_label = paste0(opt_pp)) %>% # Preserve label
unnest_wider(opt_pp, names_sep = "") %>%
rename(min = opt_pp1, max = opt_pp2)
) %>%
# flag if ppsr is within the optimal range
mutate(in_pp = between(ppsr, min, max)) %>%
# compute proportion of ppsr in optimal range per range
group_by(opt_pp = fct_inorder(opt_label)) %>%
summarise(rand_prop = mean(in_pp), .groups = "drop")
df_rand_prop# A tibble: 5 × 2
opt_pp rand_prop
<fct> <dbl>
1 c(1, 4) 0.495
2 c(2, 8) 0.0883
3 c(3, 12) 0.0267
4 c(4, 16) 0.00955
5 c(5, 20) 0.00429
Interpretation:
in the size range c(1, 4) , 49.5% of PPSR fall in the optimal size range
in the size range c(3, 12) , 2.7% of PPSR fall in the optimal size range
in the size range c(5, 20) , 0.4% of PPSR fall in the optimal size range
Thus, for a given size range, for a given pair, if the proportion of PPSR is less than this random proportion, we can assume a negative predation interaction for this pair. On the opposite, if the proportion of PPSR is higher than the random proportion, we can assume a positive predation interaction. Let’s integrate these random proportion into the matrices to flag negative and positive predation interaction.
Then, we can use two different ways to correct the matrices:
ratio between the proportion of PPSR in the optimal range divided by the random proportion: values < 1 indicate negative interaction, values > 1 indicate positive interaction
difference between the proportion of PPSR in the optimal range and the random proportion: values < 0 indicate negative interaction, values > 0 indicate positive interaction
Let’s focus on difference now. First we compute it, and then we standardize it in the [-1; 1] range for each size range.
# Join computed PPSR proportions with random ones
prop_ppsr <- prop_ppsr %>%
left_join(df_rand_prop, by = join_by(opt_pp)) %>%
mutate(
size_int = prop_pp_in - rand_prop, # difference between computed and random prop
direction = ifelse(size_int > 0, "pos", "neg") # direction of interaction
)
# For each size range, standardize in the [-1; 1] range
prop_ppsr <- prop_ppsr %>%
group_by(opt_pp) %>%
mutate(size_int = (size_int - min(abs(size_int)))/(max(abs(size_int)) - min(abs(size_int)))) %>%
ungroup()Let’s plot computed matrices.
ggplot(prop_ppsr) +
geom_tile(aes(x = predator, y = prey, fill = size_int)) +
scale_fill_gradient2(low = "#ca0020", high = "#0571b0", na.value = "grey90") +
labs(x = "Predator", y = "Prey", fill = "Size\nint") +
coord_fixed() +
facet_wrap(~opt_pp, ncol = 2) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Which optimal size-range should we choose?
find some metric within the size-based matrices?
find the size-range that best mimic other matrices, such as the distance-based one
Compare with distance-based matrix
Load the distance based matrix and process it.
load("data/07.distance_matrix.Rdata")
df_mat_ks <- df_mat_ks %>%
mutate(t1 = as.factor(t1), t2 = as.factor(t2))
# Plot our original matrix
ggplot(df_mat_ks) +
geom_tile(aes(x = t1, y = t2, fill = int_dist)) +
scale_fill_viridis_c(limits = c(0, 1), na.value = NA) +
labs(x = "Predator", y = "Prey", fill = "KS metric") +
coord_fixed() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# First, we need to make the KS matrix symmetric
# For this, we create a second version of the matrix but we swap t1 and t2
df_mat_ks_sym <- df_mat_ks %>%
# swap column names using a temporary column
rename(tmp = t1, t1 = t2) %>%
rename(t2 = tmp) %>%
# reorganise columns
select(t1, t2, int_dist) %>%
# bind with the original matrix
bind_rows(df_mat_ks)
# Replace NA by zeros (i.e. no interaction)
df_mat_ks_sym <- df_mat_ks_sym %>% mutate(int_dist = replace_na(int_dist, 0))
# Plot again
ggplot(df_mat_ks_sym) +
geom_tile(aes(x = t1, y = t2, fill = int_dist)) +
scale_fill_viridis_c(limits = c(0, 1), na.value = NA) +
labs(x = "Predator", y = "Prey", fill = "KS metric") +
coord_fixed() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# GoodLet’s now compare the size-based matrices and the distance-based one.
comp <- prop_ppsr %>%
# join size-based and distance-based
left_join(df_mat_ks_sym %>% rename(predator = t1, prey = t2), by = join_by(predator, prey), relationship = "many-to-many") %>%
# compate difference in both metrics
mutate(diff = size_int - int_dist)
comp %>%
ggplot() +
geom_abline(slope = 1, intercept = 0, colour = "red") +
geom_point(aes(x = prop_pp_in, y = int_dist)) +
coord_fixed() +
facet_wrap(~opt_pp, ncol = 2)Let’s now have a look at the differences.
# Plot the differences as matrices
ggplot(comp) +
geom_tile(aes(x = predator, y = prey, fill = diff)) +
#scale_fill_viridis_c(limits = c(0, 1)) +
scale_fill_gradient2(low = "#ca0020", high = "#0571b0", na.value = "grey90") +
labs(x = "Predator", y = "Prey", fill = "Diff (size - ks)") +
coord_fixed() +
facet_wrap(~opt_pp, ncol = 2) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))# Plot the distribution of differences
ggplot(comp, aes(x = diff, y = opt_pp)) +
geom_vline(xintercept = 0, colour = "grey", linewidth = 2) +
geom_boxplot() +
stat_summary(fun = mean, geom = "point", colour = "red", shape = 4, size = 3) +
labs(x = "Diff (size - ks)", y = "Consid. opt. PPSR") +
theme_classic()The optimal range seems to be c(3, 12). Let’s keep this matrix.
Symmetrize the matrix
Compared to the co-occurrence and distance-based matrix, the size-based matrix is not symmetrical: pairs A-B and B-A don’t have the same values. Let’s fix this by taking the max between A-B and B-A.
# Select the chosen size range
df_size <- prop_ppsr %>%
filter(opt_pp == "c(3, 12)") %>%
select(t1 = predator, t2 = prey, size_int)
# Symmetrize it
df_size <- df_size %>%
# build pair from t1 and t2 (same pair for t1 - t2 and t2 - t1)
mutate(pair = case_when(
t1 < t2 ~ paste(t1, t2, sep = " - "),
.default = paste(t2, t1, sep = " - ")
), .before = t1) %>%
group_by(pair) %>%
# take the max by pair
summarise(size_int = max(size_int), .groups = "drop") %>%
# retrieve t1 and t2
separate(pair, into = c("t1", "t2"), sep = " - ")
# Plot result
ggplot(df_size) +
geom_raster(aes(x = t1, y = t2, fill = size_int)) +
labs(x = "t1", y = "t2", fill = "Int") +
coord_fixed() +
scale_fill_gradient2(na.value = NA, low = "#ca0020", high = "#0571b0") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))All good, save it!
Save
save(df_size, file = "data/16.size_matrix.Rdata")